################################################
###  Downloading necessary support packages  ###
###   to run the R code                      ###
################################################

### Install only the support packages that are not already    ###
### available, so re-running the script does not re-download  ###
### them (and it still works offline once they are installed).###
required.pkgs <- c("tree", "ISLR")
is.installed <- vapply(required.pkgs, requireNamespace,
                       logical(1), quietly = TRUE)
if (!all(is.installed)) {
  install.packages(required.pkgs[!is.installed])
}
library(tree)
library(ISLR)

################################################
### "Carseats" ships with the ISLR package.  ###
### A data set with 400 observations about   ###
### the sales of child car seats. It has 11  ###
### variables or characteristics.            ###
### Source: rdrr.io/cran/ISLR/Carseats.html  ###
################################################

### The data set is not attach()ed: its columns are referenced  ###
### through the data frame, so no stale copy of the columns is  ###
### left on the search path once 'Carseats' is modified below.  ###

###  Checking the data set  ###
dim(Carseats)     ### number of rows & columns
head(Carseats)    ### first 6 lines of the data
tail(Carseats)    ### last 6 lines of the data

###################################################
### Analysis objective: what factors contribute ###
### to the increased sales of child car seats?  ###
### Use a classification tree to predict high   ###
### sales of car seats.                         ###
###################################################

### Creating a new variable to indicate 'high' & 'low' sales   ###
### based on the existing variable 'Sales', which is recorded  ###
### in thousands of units. Sales of 8 (8,000 units) or less is ###
### a low sale ("No"); anything above is a high sale ("Yes").  ###

High <- factor(ifelse(Carseats$Sales <= 8, "No", "Yes"))

### Add the new variable 'High' to the Carseats data.          ###
### The column is assigned by name instead of rebuilding the   ###
### frame with data.frame(): re-running the script then        ###
### overwrites 'High' rather than appending a duplicate        ###
### 'High.1' column that would leak the label into the model.  ###
Carseats$High <- High

### Check to make sure everything worked  ###
dim(Carseats)
head(Carseats)
tail(Carseats)

### Fitting a classification tree on the Carseats data.        ###
### The data are split at random into two parts: a training    ###
### set, used to grow the tree, and a test set, used to verify ###
### its predictions.                                           ###
### Fixing the seed makes the random split reproducible.       ###
set.seed(123)

### Training set: row indices of 200 observations drawn at     ###
### random from the Carseats data.                             ###
train <- sample(seq_len(nrow(Carseats)), 200)

### Test set: the remaining observations (200 of the 400 rows).###
### The test labels are read from the test frame itself so     ###
### they always line up row-for-row with 'Carseats.test'.      ###
Carseats.test <- Carseats[-train, ]
High.test <- Carseats.test$High

### Grow the classification tree on the training rows only.    ###
### 'High' is the response and every other column is a         ###
### candidate predictor, except 'Sales', which is excluded     ###
### because 'High' was derived from it.                        ###
tree.carseats <- tree(High ~ . - Sales, data = Carseats, subset = train)

### Summarise the fit, then draw the tree and label its splits ###
### (pretty = 0 shows the factor level names unabbreviated).   ###
summary(tree.carseats)
plot(tree.carseats)
text(tree.carseats, pretty = 0)

### "Shelve location", "Age", and "Price" are the ###
### 3 most important indicators of high sales     ###

### Predict the class label of every test observation with     ###
### 'tree.carseats' (type = "class" returns labels).           ###
tree.pred <- predict(tree.carseats, Carseats.test, type = "class")

### Confusion matrix: predicted labels (rows) against the      ###
### actual test labels (columns).                              ###
table(tree.pred, High.test)

### Compute the accuracy rate: the share of test observations  ###
### whose predicted label matches the actual label.            ###
mean(tree.pred == High.test)

### Cross-validation and pruning to improve prediction         ###
### quality. Passing FUN = prune.misclass makes the            ###
### classification error guide both processes.                 ###

### The seed is fixed so the cross-validation is reproducible. ###
set.seed(456)
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
cv.carseats
cv.carseats$size  ## number of terminal nodes of each tree considered
cv.carseats$dev   ## cross-validated misclassification count per size (a count, not a rate)

## Choose the tree size from the cross-validation results instead of
## hard-coding it: the CV errors depend on the random split and folds,
## so the best size is not guaranteed to be the same on every setup.
## Among the sizes with the lowest CV error the smallest tree is taken
## (ties go to the simpler model). Size 1 (root only) is left out
## because a single-node tree cannot be plotted.
cv.results <- data.frame(size = cv.carseats$size, dev = cv.carseats$dev)
cv.results <- cv.results[cv.results$size > 1, ]
best.size <- min(cv.results$size[cv.results$dev == min(cv.results$dev)])
best.size

## Pruning the tree down to the selected number of terminal nodes
prune.carseats <- prune.misclass(tree.carseats, best = best.size)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

## Check to see if pruning improved results: confusion matrix and
## accuracy rate of the pruned tree on the same test set
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
mean(tree.pred == High.test)